library(phyloseq); packageVersion("phyloseq")
## [1] '1.46.0'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.5.1'
library(readr); packageVersion("readr")
## [1] '2.1.5'
library(tidyr); packageVersion("tidyr")
## [1] '1.3.1'
library(purrr); packageVersion("purrr")
## [1] '1.0.2'
library(furrr); packageVersion("furrr")
## Loading required package: future
## Warning: package 'future' was built under R version 4.3.3
## [1] '0.3.1'
library(dplyr); packageVersion("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## [1] '1.1.4'
library(stringr); packageVersion("stringr")
## [1] '1.5.1'
library(forcats); packageVersion("forcats")
## [1] '1.0.0'
library(metacoder); packageVersion("metacoder")
## This is metacoder version 0.3.7 (stable)
##
## Attaching package: 'metacoder'
## The following object is masked from 'package:ggplot2':
##
## map_data
## The following object is masked from 'package:phyloseq':
##
## filter_taxa
## [1] '0.3.7'
library(data.table); packageVersion("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## [1] '1.15.4'
library(decontam); packageVersion("decontam")
## [1] '1.22.0'
library(Biostrings); packageVersion("Biostrings")
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:data.table':
##
## first, second
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:metacoder':
##
## reverse
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## The following object is masked from 'package:phyloseq':
##
## distance
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.3.3
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:metacoder':
##
## complement
## The following object is masked from 'package:base':
##
## strsplit
## [1] '2.70.3'
library(magick); packageVersion("magick")
## Warning: package 'magick' was built under R version 4.3.3
## Linking to ImageMagick 6.9.12.93
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
## [1] '2.8.4'
library(vegan); packageVersion("vegan")
## Warning: package 'vegan' was built under R version 4.3.3
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-6.1
## [1] '2.6.6.1'
library(pdftools);packageVersion("pdftools")
## Warning: package 'pdftools' was built under R version 4.3.3
## Using poppler version 23.04.0
## [1] '3.4.1'
library(grid)
##
## Attaching package: 'grid'
## The following object is masked from 'package:Biostrings':
##
## pattern
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
knitr::opts_knit$set(root.dir = "~/benchmark_demulticoder")
seed <- 1
set.seed(seed)
asv_matrix_rps10<-read.csv("demulticoder/data/final_asv_abundance_matrix_rps10.csv")
asv_matrix_rps10$dada2_tax <- asv_matrix_rps10$dada2_tax <- gsub("Stramenopila", "Eukaryota--100--Domain;Stramenopila", asv_matrix_rps10$dada2_tax)
asv_matrix_rps10 <- asv_matrix_rps10[, -1]
colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)] <- gsub("_rps10$", "", colnames(asv_matrix_rps10)[3:ncol(asv_matrix_rps10)])
asv_matrix_its<-read.csv("demulticoder/data/final_asv_abundance_matrix_its.csv")
asv_matrix_its$dada2_tax <- gsub("Fungi", "Eukaryota--100--Domain;Fungi", asv_matrix_its$dada2_tax)
asv_matrix_its <- asv_matrix_its[, -1]
colnames(asv_matrix_its)[3:ncol(asv_matrix_rps10)] <- gsub("_its$", "", colnames(asv_matrix_its)[3:ncol(asv_matrix_its)])
#Let's combine these matrices
#For easier analysis, we previously combined the two matrices, and appended domain info to each one so we can make one heat tree for combined dataset
sample_cols_its <- setdiff(names(asv_matrix_its), c("sequence", "dada2_tax"))
sample_cols_rps10 <- setdiff(names(asv_matrix_rps10), c("sequence", "dada2_tax"))
if(!all(sample_cols_its == sample_cols_rps10)) {
stop("Sample columns do not match between ITS and RPS10 dataframes!")
}
abundance <- rbind(
asv_matrix_its[, c("sequence", "dada2_tax", sample_cols_its)], # ITS data
asv_matrix_rps10[, c("sequence", "dada2_tax", sample_cols_rps10)] # RPS10 data
)
write.csv(abundance, "demulticoder/data/final_asv_abundance_matrix_combined_demulticoder.csv")
metadata_path <- file.path("demulticoder/data/metadata.csv")
metadata <- read_csv(metadata_path)
## Rows: 174 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_name, well, organism, sample_type
## dbl (3): plate, path_conc, experiment
## lgl (2): flooded, is_ambiguous
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(metadata)
## # A tibble: 174 × 9
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <chr> <lgl> <dbl> <dbl> <chr>
## 1 SDNA_C1 1 G02 <NA> NA NA NA Negative contr…
## 2 SDNA_C2 1 H07 <NA> NA NA NA Negative contr…
## 3 SDNA_C3 2 E02 <NA> NA NA NA Negative contr…
## 4 SDNA_C4 2 A07 <NA> NA NA NA Negative contr…
## 5 SMC1 1 G12 <NA> NA NA NA Mock community
## 6 SMC2 1 H12 <NA> NA NA NA Mock community
## 7 SMC3 2 C11 <NA> NA NA NA Mock community
## 8 SMC4 2 D11 <NA> NA NA NA Mock community
## 9 SPCR_C1 1 D05 <NA> NA NA NA Negative contr…
## 10 SPCR_C2 1 C10 <NA> NA NA NA Negative contr…
## # ℹ 164 more rows
## # ℹ 1 more variable: is_ambiguous <lgl>
#Fourth, let's track reads
track_reads_demulticoder_its<-read.csv("demulticoder/data/track_reads_its.csv", row.names = 1)
track_reads_demulticoder_rps10<-read.csv("demulticoder/data/track_reads_rps10.csv", row.names = 1)
its_summary_reads<-summary(track_reads_demulticoder_its)
print(its_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 82 Min. : 1 Min. : 1 Min. : 1
## 1st Qu.: 23440 1st Qu.:12671 1st Qu.:12375 1st Qu.:12308
## Median : 37507 Median :23008 Median :22684 Median :22513
## Mean : 42989 Mean :25466 Mean :25212 Mean :25078
## 3rd Qu.: 57640 3rd Qu.:31912 3rd Qu.:31614 3rd Qu.:31508
## Max. :136509 Max. :87421 Max. :87247 Max. :87170
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.:11794 1st Qu.:11794
## Median :21770 Median :21770
## Mean :24322 Mean :24230
## 3rd Qu.:30406 3rd Qu.:30050
## Max. :86880 Max. :86868
rps10_summary_reads<-summary(track_reads_demulticoder_rps10)
print(rps10_summary_reads)
## input filtered denoisedF denoisedR
## Min. : 91 Min. : 26 Min. : 7 Min. : 6
## 1st Qu.: 37994 1st Qu.: 29199 1st Qu.: 29186 1st Qu.: 29184
## Median : 61070 Median : 48085 Median : 47950 Median : 48060
## Mean : 73511 Mean : 56500 Mean : 56435 Mean : 56433
## 3rd Qu.: 95487 3rd Qu.: 74825 3rd Qu.: 74408 3rd Qu.: 74436
## Max. :263101 Max. :219382 Max. :218997 Max. :219291
## merged nonchim
## Min. : 0 Min. : 0
## 1st Qu.: 29172 1st Qu.: 29172
## Median : 47653 Median : 47653
## Mean : 56030 Mean : 55349
## 3rd Qu.: 73485 3rd Qu.: 73400
## Max. :218521 Max. :217372
# Function to extract taxonomic levels and count unique entries with bootstrap filtering
# Function to extract taxonomic levels and count unique entries
analyze_taxonomy <- function(data) {
# Get taxonomy data
tax_data <- data$dada2_tax
# Split taxonomy string into components
tax_splits <- strsplit(tax_data, ";")
# Function to safely extract taxonomic level with proper rank checking
extract_tax_level <- function(tax_array, rank) {
# Find the position containing the rank
rank_pos <- grep(rank, tax_array)
if(length(rank_pos) > 0) {
# Extract and clean the taxonomy name
tax <- tax_array[rank_pos]
# Remove the rank pattern and any leading/trailing whitespace
cleaned_tax <- trimws(gsub(paste0("--\\d+--", rank), "", tax))
return(cleaned_tax)
}
return(NA)
}
# Extract each taxonomic level with proper hierarchy checking
families <- sapply(tax_splits, function(x) extract_tax_level(x, "Family"))
genera <- sapply(tax_splits, function(x) extract_tax_level(x, "Genus"))
species <- sapply(tax_splits, function(x) extract_tax_level(x, "Species"))
# Create a data frame to check hierarchy consistency
tax_df <- data.frame(
Family = families,
Genus = genera,
Species = species,
stringsAsFactors = FALSE
)
# Remove NA values before counting
families <- families[!is.na(families)]
genera <- genera[!is.na(genera)]
species <- species[!is.na(species)]
# Count unique entries
unique_counts <- list(
families = length(unique(families)),
genera = length(unique(genera)),
species = length(unique(species))
)
# Get prevalence with hierarchy checking
family_prev <- table(families)
genus_prev <- table(genera)
species_prev <- table(species)
# Sort prevalence tables
family_prev <- sort(family_prev, decreasing = TRUE)
genus_prev <- sort(genus_prev, decreasing = TRUE)
species_prev <- sort(species_prev, decreasing = TRUE)
# Print top 5 of each level for verification
cat("\nTop 5 most prevalent Families:\n")
print(head(family_prev, 5))
cat("\nTop 5 most prevalent Genera:\n")
print(head(genus_prev, 5))
cat("\nTop 5 most prevalent Species:\n")
print(head(species_prev, 5))
# Get most prevalent taxa with hierarchy verification
most_prevalent <- list(
family = if(length(family_prev) > 0) names(family_prev)[1] else "None found",
genus = if(length(genus_prev) > 0) names(genus_prev)[1] else "None found",
species = if(length(species_prev) > 0) names(species_prev)[1] else "None found"
)
return(list(
unique_counts = unique_counts,
most_prevalent = most_prevalent,
tax_df = tax_df # Return full taxonomy dataframe for inspection
))
}
# Run analysis for both datasets
if(exists("asv_matrix_rps10")) {
rps10_results <- analyze_taxonomy(asv_matrix_rps10)
}
##
## Top 5 most prevalent Families:
## families
## Pythiaceae Peronosporaceae Saprolegniaceae Saplolegniaceae Lagenidiaceae
## 171 126 20 10 9
##
## Top 5 most prevalent Genera:
## genera
## Pythium Phytophthora Aphanomyces Peronospora Saprolegnia
## 152 96 18 16 11
##
## Top 5 most prevalent Species:
## species
## Phytophthora_sp.kununurra Phytophthora_citrophthora Pythium_lutarium
## 23 19 19
## Pythium_dissotocum Phytophthora_sojae
## 14 13
if(exists("asv_matrix_its")) {
its_results <- analyze_taxonomy(asv_matrix_its)
}
##
## Top 5 most prevalent Families:
## families
## Fungi_fam_Incertae_sedis Serendipitaceae
## 366 200
## Rozellomycota_fam_Incertae_sedis Hyaloscyphaceae
## 176 157
## Tuberaceae
## 128
##
## Top 5 most prevalent Genera:
## genera
## Fungi_gen_Incertae_sedis Serendipita
## 366 196
## Rozellomycota_gen_Incertae_sedis Tuber
## 176 128
## Hyaloscypha
## 94
##
## Top 5 most prevalent Species:
## species
## Fungi_sp Serendipita_sp Rozellomycota_sp
## 366 190 176
## Tuber_pseudobrumale Archaeorhizomyces_sp
## 128 48
First we will configure the combined taxmap object
#load reference database
rps10_database <- read_fasta("demulticoder/data/rps10_reference_db.fa")
innoc_species <- c(Cin = "Phytophthora_cinnamomi", Plu = "Phytophthora_plurivora", Cry = "Pythium_cryptoirregulare")
innoc_regex <- paste0('(', paste0(innoc_species, collapse = '|'), ')')
innoc_seqs <- rps10_database[grepl(names(rps10_database), pattern = innoc_regex)]
innoc_seqs <- innoc_seqs[! duplicated(innoc_seqs)]
names(innoc_seqs) <- str_match(names(innoc_seqs), pattern = innoc_regex)[, 2]
iupac_match <- function(asv_chars, ref_chars) {
map2_lgl(asv_chars, ref_chars, function(asv, ref) grepl(asv, pattern = paste0('[', IUPAC_CODE_MAP[ref], ']+')))
}
# Count number of mismatches in an alignment, allowing for IUPAC codes in reference
align_mismatch <- function(alignment) {
asv_chars <- strsplit(as.character(alignment@pattern), '')[[1]]
ref_chars <- strsplit(as.character(alignment@subject), '')[[1]]
sum(! iupac_match(asv_chars, ref_chars))
}
# Align each sequence to each asv and make table of results"
aligned_data_path <- file.path("demulticoder/data", "infection_aligned_data.rds")
if (file.exists(aligned_data_path)) {
aligned_data <- readRDS(aligned_data_path)
} else {
aligned_data <- future_map_dfr(seq_along(innoc_seqs), function(i) {
aligned <- lapply(abundance$sequence, function(asv) pairwiseAlignment(pattern = asv, subject = innoc_seqs[i], type = 'global-local'))
print(paste("Processing species:", names(innoc_seqs)[i]))
tibble(
species = names(innoc_seqs)[i],
align_len = map_dbl(aligned, nchar),
mismatch = map_dbl(aligned, align_mismatch),
pid = (align_len - mismatch) / align_len,
asv_seq = abundance$sequence,
ref_seq = innoc_seqs[i],
alignment = aligned
)
})
saveRDS(aligned_data, file = aligned_data_path)
}
# Convert ASV abundances to proportions for use with matches
abundance_prop <- abundance
abundance_prop[metadata$sample_name] <- lapply(abundance_prop[metadata$sample_name], function(counts) counts/sum(counts))
innoculum_asv_mismatch_threshold <-1
# Sum the read counts of ASVs representing the same species used for inoculation
infection_data <- aligned_data %>%
filter(mismatch <= innoculum_asv_mismatch_threshold) %>%
left_join(abundance_prop, by = c(asv_seq = "sequence")) %>%
group_by(asv_seq) %>% slice_min(mismatch, with_ties = FALSE) # Only consider the best match per ASV
## Update abundance matrix and metadata with results
inoculum_asv_key <- setNames(infection_data$species, infection_data$asv_seq)
abundance <- mutate(abundance, is_inoculum = sequence %in% infection_data$asv_seq, inoculum = inoculum_asv_key[sequence], .after = "dada2_tax")
write_csv(abundance, file.path('demulticoder/data', 'abundance_with_infection_data.csv'))
#Let's append metadata table with more info
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
# Get per-sample proportions of reads
species_name_key <- c(Phytophthora_cinnamomi = "cin_prop", Phytophthora_plurivora = "plu_prop", Pythium_cryptoirregulare = "cry_prop")
sample_infection_data <- infection_data %>%
group_by(species) %>% summarise_at(metadata$sample_name, sum) %>% # Sum per inoculum species
tidyr::gather(key = "sample_name", value = "read_prop", !!! metadata$sample_name) %>%
mutate(species = species_name_key[species]) %>%
mutate(read_prop = ifelse(is.nan(read_prop), 0, read_prop)) %>%
tidyr::spread(key = 'species', value = 'read_prop')
metadata <- metadata %>%
select(sample_name, plate, well, organism, flooded, path_conc, experiment, sample_type, is_ambiguous) %>% # Avoid duplicating columns from previous runs
left_join(sample_infection_data, by = "sample_name")
# Get expected inoculum proportion and non-target proportion
metadata$expected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(NA)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]])
}
})
metadata$unexpected_innoc_prop <- map_dbl(1:nrow(metadata), function(i) {
prop_cols <- c("cin_prop", "cry_prop", "plu_prop")
if (metadata$sample_type[i] == "Mock community" || metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(sum(metadata[i, prop_cols]))
} else {
expected_prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
prop_cols <- prop_cols[! prop_cols %in% expected_prop_col]
return(sum(metadata[i, prop_cols]))
}
})
# Check if expected inoculum was found
metadata$expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0)
}
})
# Check if ONLY expected inoculum was found
metadata$only_expected_innoc <- map_lgl(1:nrow(metadata), function(i) {
all_innoc <- sum(metadata[i, c('cin_prop', 'cry_prop', 'plu_prop')])
if (metadata$sample_type[i] == "Mock community") {
return(NA)
} else if (metadata$sample_type[i] == "Negative control" || metadata$organism[i] == "Control") {
return(all_innoc == 0)
} else {
prop_col <- paste0(tolower(metadata$organism[i]), '_prop')
return(metadata[[i, prop_col]] > 0 && metadata[[i, prop_col]] == all_innoc)
}
})
# Look at number of samples with expected inoculum
table(metadata$expected_innoc)
##
## FALSE TRUE
## 88 82
table(metadata$only_expected_innoc)
##
## FALSE TRUE
## 107 63
metadata$valid_inoc <- (is.na(metadata$expected_innoc_prop) | metadata$expected_innoc_prop > 0.0001) & metadata$unexpected_innoc_prop < 0.0001
map_dbl(split(metadata$valid_inoc, metadata$organism), sum)
## Cin Control Cry Plu
## 26 9 16 8
# Save new metadata
write_csv(metadata, file.path('demulticoder/data', 'metadata_with_infection_data.csv'))
metadata <- filter(metadata, ! is_ambiguous)
abundance <- filter(abundance, ! is_inoculum)
#Filter out mock community samples and also any mock community samples as well
metadata <- filter(metadata, sample_type != "Mock community" & sample_type != "Negative control")
obj <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj$data) <- c('abund', 'score')
obj <- transmute_obs(obj, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
# For diversity calculations, we'll use proportions of read depth
# We'll set any proportion to 0 that is less than the inverse of the read count of the non-control sample with the fewest reads.
# This should account for unequal sample read depth without the randomness of rarefaction.
metadata$raw_count <- colSums(obj$data$abund[, metadata$sample_name])
lowest_count <- min(metadata$raw_count[! is.na(metadata$organism)])
lowest_count
## [1] 12676
obj$data$prop <- calc_obs_props(obj, data = 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj$data$prop <- zero_low_counts(obj, data = 'prop', min_count = 1 / lowest_count, cols = metadata$sample_name)
## Zeroing 2166 of 438534 counts less than 7.88892395077311e-05.
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S101 S103 S104 S106 S11 S111 S113 S114
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0396 0.00190 0.000863 0.00224 0.0414 0 0.00941 3.19e-3
## 2 bgb 0.00757 0.0295 0.0311 0.0237 0.0643 0.0559 0.00866 1.99e-2
## 3 bgc 0.0316 0 0 0 0 0 0 0
## 4 bgd 0.0694 0.000426 0.0196 0.0113 0.0314 0.00204 0 0
## 5 bge 0.0121 0.00728 0.0241 0.00656 0.00528 0.0268 0.0231 4.98e-2
## 6 bgf 0.0190 0.0162 0.00525 0.0428 0.00153 0.0259 0.0238 6.63e-2
## 7 bgg 0.000188 0.000455 0.00123 0.000232 0.000208 0.00305 0.00118 7.82e-4
## 8 bgh 0.00110 0.00250 0.00602 0.000290 0.00813 0.0232 0.00115 4.36e-3
## 9 bgc 0 0 0 0 0.188 0 0 0
## 10 bgc 0 0 0 0 0.183 0 0 0
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S115 <dbl>, S118 <dbl>, S120 <dbl>, S122 <dbl>,
## # S125 <dbl>, S127 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S140 <dbl>,
## # S142 <dbl>, S143 <dbl>, S144 <dbl>, S145 <dbl>, S146 <dbl>, S147 <dbl>,
## # S148 <dbl>, S149 <dbl>, S15 <dbl>, S154 <dbl>, S156 <dbl>, S157 <dbl>,
## # S159 <dbl>, S16 <dbl>, S160 <dbl>, S161 <dbl>, S162 <dbl>, S165 <dbl>,
## # S166 <dbl>, S169 <dbl>, S2 <dbl>, S20 <dbl>, S21 <dbl>, S24 <dbl>, …
obj$data$prop[metadata$sample_name] <- map(metadata$sample_name, function(id) {
out <- obj$data$prop[[id]]
out[is.na(out) | is.nan(out)] <- 0
out
})
obj$data$prop
## # A tibble: 2,707 × 163
## taxon_id S101 S103 S104 S106 S11 S111 S113 S114
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 bga 0.0396 0.00190 0.000863 0.00224 0.0414 0 0.00941 3.19e-3
## 2 bgb 0.00757 0.0295 0.0311 0.0237 0.0643 0.0559 0.00866 1.99e-2
## 3 bgc 0.0316 0 0 0 0 0 0 0
## 4 bgd 0.0694 0.000426 0.0196 0.0113 0.0314 0.00204 0 0
## 5 bge 0.0121 0.00728 0.0241 0.00656 0.00528 0.0268 0.0231 4.98e-2
## 6 bgf 0.0190 0.0162 0.00525 0.0428 0.00153 0.0259 0.0238 6.63e-2
## 7 bgg 0.000188 0.000455 0.00123 0.000232 0.000208 0.00305 0.00118 7.82e-4
## 8 bgh 0.00110 0.00250 0.00602 0.000290 0.00813 0.0232 0.00115 4.36e-3
## 9 bgc 0 0 0 0 0.188 0 0 0
## 10 bgc 0 0 0 0 0.183 0 0 0
## # ℹ 2,697 more rows
## # ℹ 154 more variables: S115 <dbl>, S118 <dbl>, S120 <dbl>, S122 <dbl>,
## # S125 <dbl>, S127 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>, S140 <dbl>,
## # S142 <dbl>, S143 <dbl>, S144 <dbl>, S145 <dbl>, S146 <dbl>, S147 <dbl>,
## # S148 <dbl>, S149 <dbl>, S15 <dbl>, S154 <dbl>, S156 <dbl>, S157 <dbl>,
## # S159 <dbl>, S16 <dbl>, S160 <dbl>, S161 <dbl>, S162 <dbl>, S165 <dbl>,
## # S166 <dbl>, S169 <dbl>, S2 <dbl>, S20 <dbl>, S21 <dbl>, S24 <dbl>, …
#Let's modify the metadata sheet
metadata <- metadata %>%
mutate(organism = fct_relevel(ordered(organism), "Control", "Cin", "Plu", "Cry"))
abund_table <- obj$data$abund[metadata$sample_name]
metadata$richness <- vegan::specnumber(abund_table, MARGIN = 2)
metadata$shannon <- vegan::diversity(abund_table, MARGIN = 2, index = "shannon")
metadata$invsimpson <- vegan::diversity(abund_table, MARGIN = 2, index = "invsimpson")
write.csv(metadata, file.path("demulticoder", "results/alpha_diversity.csv"))
print(metadata)
## # A tibble: 162 × 21
## sample_name plate well organism flooded path_conc experiment sample_type
## <chr> <dbl> <chr> <ord> <lgl> <dbl> <dbl> <chr>
## 1 S101 2 C02 Plu FALSE 1 2 Sample
## 2 S103 2 F02 Cin FALSE 1 2 Sample
## 3 S104 2 G02 Plu FALSE 1 2 Sample
## 4 S106 2 A03 Plu FALSE 100 2 Sample
## 5 S11 1 C02 Plu FALSE 1 1 Sample
## 6 S111 2 F03 Cin FALSE 100 2 Sample
## 7 S113 2 H03 Cry FALSE 1 2 Sample
## 8 S114 2 A04 Plu FALSE 100 2 Sample
## 9 S115 2 B04 Control FALSE 0 2 Sample
## 10 S118 2 F04 Cry FALSE 100 2 Sample
## # ℹ 152 more rows
## # ℹ 13 more variables: is_ambiguous <lgl>, cin_prop <dbl>, cry_prop <dbl>,
## # plu_prop <dbl>, expected_innoc_prop <dbl>, unexpected_innoc_prop <dbl>,
## # expected_innoc <lgl>, only_expected_innoc <lgl>, valid_inoc <lgl>,
## # raw_count <dbl>, richness <int>, shannon <dbl>, invsimpson <dbl>
plotted_factors <- c('Organism' = 'organism', 'Flooded' = 'flooded', 'Pathogen Concentration' = 'path_conc', 'Trial' = 'experiment')
# Reformat data for plotting
alpha_plot_data <- plotted_factors %>%
map2_dfr(names(plotted_factors),
function(factor, factor_name) {
out <- metadata
out$factor <- factor_name
out$value <- as.character(metadata[[factor]])
return(out)
}) %>%
mutate(path_conc = factor(path_conc,
levels = sort(unique(path_conc)),
labels = paste(sort(unique(path_conc)), 'CFU/g'),
ordered = TRUE)) %>%
filter(sample_type == 'Sample') %>%
select(sample_name, factor, value, invsimpson) %>%
tidyr::gather(key = "index", value = "diversity", -sample_name, -factor, -value) %>%
mutate(value = forcats::fct_relevel(ordered(value), "Control", "Cin", "Plu", "Cry"))
# ANOVA and Tukey's HSD
anova_and_hsd <- function(x) {
anova_result <- aov(diversity ~ value, x)
tukey_result <- agricolae::HSD.test(anova_result, "value", group = TRUE)
group_data <- tukey_result$groups[order(rownames(tukey_result$groups)),]
group_key <- setNames(group_data$groups, rownames(group_data))
group_key[as.character(x$value)]
}
alpha_plot_data$group <- unlist(map(split(alpha_plot_data, alpha_plot_data$factor)[unique(alpha_plot_data$factor)], anova_and_hsd))
alpha_subplot <- ggplot(alpha_plot_data, aes(x = value, y = diversity)) +
geom_boxplot() +
geom_text(aes(x = value,
y = max(diversity) + 2,
label = group),
col = 'black',
size = 5) +
facet_grid( ~ factor, scales = "free") +
labs(x = NULL, y = 'Diversity (Inverse Simpson)') +
guides(color = "none") +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position="bottom")
set.seed(1)
prob_table <- obj$data$prop[metadata$sample_name]
nmds_plot_data <- function(prob_table) {
metadata <- metadata[metadata$sample_name %in% colnames(prob_table), ]
set.seed(1)
nmds_results <- vegan::metaMDS(t(prob_table), trymax = 1000, k = 2, trace = 0)
nmds_data <- nmds_results$points %>%
as_tibble() %>%
bind_cols(metadata)
names(nmds_data)[1:2] <- paste0("NMDS", 1:2)
return(nmds_data)
}
nmds_data <- nmds_plot_data(prob_table[! is.na(metadata$organism) & metadata$valid_inoc])
nmds_factors <- c(Flooded = 'flooded', Organism = 'organism', 'Pathogen CFU/g' = 'path_conc', 'Trial' = 'experiment')
make_one_plot <- function(factor, name) {
nmds_data %>%
mutate(factor = as.character(nmds_data[[factor]]),
NMDS1 = scales::rescale(NMDS1),
NMDS2 = scales::rescale(NMDS2)) %>%
mutate(factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu", "Cry")) %>%
ggplot(aes_string(x = "NMDS1", y = "NMDS2", color = "factor", label = "sample_name")) +
# geom_label(size = 2) +
geom_point() +
coord_fixed() +
viridis::scale_color_viridis(discrete = TRUE, end = .9) +
labs(color = NULL, x = NULL, y = NULL) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text = element_blank(),
axis.title = element_text(size = 7),
axis.ticks = element_blank(),
plot.margin = unit(rep(0.04, 4), "cm"),
# panel.background = element_rect(fill = 'transparent', colour = NA),
# plot.background = element_rect(fill = "white", colour = NA),
legend.position = "bottom",
legend.text = element_text(size = 10),
legend.key.height = unit(0.5, 'cm'),
legend.key.width = unit(0.2, 'cm'))
}
nmds_subplots <- map2(nmds_factors, names(nmds_factors), make_one_plot)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
## There was 1 warning in `mutate()`.
## ℹ In argument: `factor = fct_relevel(ordered(factor), "Control", "Cin", "Plu",
## "Cry")`.
## Caused by warning:
## ! 4 unknown levels in `f`: Control, Cin, Plu, and Cry
nmds_plot <- ggpubr::ggarrange(plotlist = c(list(ggplot() + theme_void()), nmds_subplots),
nrow = 1, widths = c(0.15, 1, 1, 1, 1))
ggsave(nmds_plot, path = "demulticoder/figures", filename = "nmds.pdf",
width = 7, height = 8)
print(nmds_plot)
combined_div_plot <- ggpubr::ggarrange(alpha_subplot, nmds_plot, ncol = 1, labels = c('A', 'B'),
heights = c(1, 1))
combined_div_plot
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.pdf",
width = 10, height = 6, bg = "#FFFFFF")
ggsave(combined_div_plot, path = "demulticoder/figures", filename = "diversity.svg",
width = 10, height = 6, bg = "#FFFFFF")
dist_matrix <- vegdist(t(prob_table), method = "bray")
adonis2_full <- adonis2(dist_matrix ~ organism + flooded + path_conc + experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_full)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism + flooded + path_conc + experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.3900 0.061 .
## flooded 1 1.848 0.03514 6.1469 0.001 ***
## path_conc 1 0.276 0.00524 0.9165 0.509
## experiment 1 2.614 0.04971 8.6953 0.001 ***
## Residual 155 46.598 0.88608
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_org <- adonis2(dist_matrix ~ organism,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_org)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ organism, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## organism 3 1.254 0.02384 1.2861 0.107
## Residual 158 51.336 0.97616
## Total 161 52.590 1.00000
adonis2_flooded <- adonis2(dist_matrix ~ flooded,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_flooded)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ flooded, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## flooded 1 1.849 0.03515 5.8294 0.001 ***
## Residual 160 50.741 0.96485
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2_path_conc <- adonis2(dist_matrix ~ path_conc,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_path_conc)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ path_conc, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## path_conc 1 0.308 0.00586 0.9431 0.474
## Residual 160 52.281 0.99414
## Total 161 52.590 1.00000
adonis2_experiment <- adonis2(dist_matrix ~ experiment,
data = metadata,
permutations = 999,
method = "bray")
print(adonis2_experiment)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dist_matrix ~ experiment, data = metadata, permutations = 999, method = "bray")
## Df SumOfSqs R2 F Pr(>F)
## experiment 1 2.613 0.04969 8.3655 0.001 ***
## Residual 160 49.977 0.95031
## Total 161 52.590 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Make sure our factors are input properly as factors
metadata$path_conc <- as.factor(metadata$path_conc)
metadata$organism <- as.factor(metadata$organism)
metadata$flooded <- as.factor(metadata$flooded)
metadata$experiment <- as.factor(metadata$experiment)
abundance$is_low_abund <- rowSums(abundance[, metadata$sample_name]) < 5
obj2 <- parse_tax_data(abundance, class_cols = 'dada2_tax', class_sep = ';',
class_regex = '^(.+)--(.+)--(.+)$',
class_key = c(taxon = 'taxon_name', boot = 'info', rank = 'taxon_rank'))
names(obj2$data) <- c('abund', 'score')
obj2 <- transmute_obs(obj2, 'score', sequence = sequence[input_index], boot = boot, rank = rank)
print(obj2)
## <Taxmap>
## 1573 taxa: aab. Eukaryota, aac. Fungi ... cin. Fungi_sp
## 1573 edges: NA->aab, aab->aac, aab->aad ... cil->cim, cim->cin
## 2 data sets:
## abund:
## # A tibble: 2,707 × 180
## taxon_id sequence dada2_tax is_inoculum inoculum S1 S10
## <chr> <chr> <chr> <lgl> <chr> <int> <int>
## 1 bga AAGTCGTAA… Eukaryot… FALSE <NA> 4670 3565
## 2 bgb AAGTCGTAA… Eukaryot… FALSE <NA> 493 668
## 3 bgc AAAAAGTCG… Eukaryot… FALSE <NA> 7603 0
## # ℹ 2,704 more rows
## # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
## # S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
## # S108 <int>, S109 <int>, …
## score:
## # A tibble: 23,852 × 4
## taxon_id sequence boot rank
## <chr> <chr> <chr> <chr>
## 1 aab AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 Doma…
## 2 aac AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 King…
## 3 aae AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73 Phyl…
## # ℹ 23,849 more rows
## 0 functions:
obj2$data$tax_abund <- calc_taxon_abund(obj2, data = "abund", cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1573 taxa
obj2$data$abund_prop <- calc_obs_props(obj2, data = "abund", cols = metadata$sample_name, groups = rep("tax_prop", nrow(metadata)))
## Calculating proportions from counts for 162 columns in 1 groups for 2707 observations.
obj2$data$tax_prop <- calc_taxon_abund(obj2, data = "abund_prop", cols = "tax_prop")
## Summing per-taxon counts from 1 columns for 1573 taxa
obj2$data$abund_prop <- NULL
obj_subset <- obj2 %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
min_bootstrap <- 0
obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
## [1] 2.34049e-11 1.00000e+00
## [1] -7.478532 29.091700
## [1] 1.295288e-22 9.957599e-01
## [1] -5.153382 25.613136
## [1] 7.212727e-51 1.000000e+00
## [1] -43.59079 42.09414
## [1] 2.011151e-190 9.952921e-01
## [1] -25.83220 26.94927
## quartz_off_screen
## 2
obj_subset <- obj2 %>%
filter_taxa(taxon_ranks == "Species", supertaxa = TRUE, reassign_obs = c(tax_abund = FALSE))
min_bootstrap <- 60
obj_subset$data$score$boot <- as.numeric(obj_subset$data$score$boot)
max_boot <- obj_subset$data$score %>%
group_by(taxon_id) %>%
summarise(max = max(boot))
max_boot <- setNames(max_boot$max, max_boot$taxon_id)
obj_subset <- filter_taxa(obj_subset, max_boot[taxon_ids] >= min_bootstrap | taxon_ranks %in% c("ASV", "Reference"), reassign_obs = c(abund = TRUE, score = FALSE))
obj_subset$data$asv_prop <- calc_obs_props(obj_subset, 'abund', cols = metadata$sample_name)
## Calculating proportions from counts for 162 columns for 2707 observations.
obj_subset$data$tax_abund <- calc_taxon_abund(obj_subset, 'abund', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1130 taxa
obj_subset$data$tax_prop <- calc_taxon_abund(obj_subset, 'asv_prop', cols = metadata$sample_name)
## Summing per-taxon counts from 162 columns for 1130 taxa
obj_subset$data$tax_data <- calc_n_samples(obj_subset, 'tax_prop', cols = metadata$sample_name[metadata$sample_type == 'Sample'])
## Calculating number of samples with a value greater than 0 for 162 columns for 1130 observations
obj_subset$data$tax_data$mean_prop <- rowMeans(obj_subset$data$tax_prop[, metadata$sample_name])
# Calculate the relative standard deviation for each taxon as a measure of how consistently it was found.
rsd <- function(x, na.rm = FALSE) {sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)}
obj_subset$data$tax_data$rel_stand_dev <- map_dbl(1:nrow(obj_subset$data$tax_abund), function(i) {
rsd(unlist(obj_subset$data$tax_abund[i, metadata$sample_name]), na.rm = TRUE)
})
obj_subset
## <Taxmap>
## 1130 taxa: aab. Eukaryota, aac. Fungi ... cin. Fungi_sp
## 1130 edges: NA->aab, aab->aac, aab->aad ... cil->cim, cim->cin
## 6 data sets:
## abund:
## # A tibble: 2,707 × 180
## taxon_id sequence dada2_tax is_inoculum inoculum S1 S10
## <chr> <chr> <chr> <lgl> <chr> <int> <int>
## 1 acm AAGTCGTAA… Eukaryot… FALSE <NA> 4670 3565
## 2 bgb AAGTCGTAA… Eukaryot… FALSE <NA> 493 668
## 3 bgc AAAAAGTCG… Eukaryot… FALSE <NA> 7603 0
## # ℹ 2,704 more rows
## # ℹ 173 more variables: S100 <int>, S101 <int>, S102 <int>,
## # S103 <int>, S104 <int>, S105 <int>, S106 <int>, S107 <int>,
## # S108 <int>, S109 <int>, …
## score:
## # A tibble: 22,959 × 4
## taxon_id sequence boot rank
## <chr> <chr> <dbl> <chr>
## 1 aab AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 Doma…
## 2 aac AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 100 King…
## 3 aae AAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGA… 73 Phyl…
## # ℹ 22,956 more rows
## tax_abund:
## # A tibble: 1,130 × 163
## taxon_id S101 S103 S104 S106 S11 S111 S113 S114 S115
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 aab 42655 70347 52167 51792 28797 44599 54012 61406 55243
## 2 aac 13060 19097 10151 12198 22562 24760 8937 23288 21088
## 3 aad 29595 51250 42016 39594 6235 19839 45075 38118 34155
## # ℹ 1,127 more rows
## # ℹ 153 more variables: S118 <dbl>, S120 <dbl>, S122 <dbl>,
## # S125 <dbl>, S127 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>,
## # S140 <dbl>, S142 <dbl>, …
## tax_prop:
## # A tibble: 1,130 × 163
## taxon_id S101 S103 S104 S106 S11 S111 S113 S114 S115
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 aab 1 1 1 1 1 1 1 1 1
## 2 aac 0.306 0.271 0.195 0.236 0.783 0.555 0.165 0.379 0.382
## 3 aad 0.694 0.729 0.805 0.764 0.217 0.445 0.835 0.621 0.618
## # ℹ 1,127 more rows
## # ℹ 153 more variables: S118 <dbl>, S120 <dbl>, S122 <dbl>,
## # S125 <dbl>, S127 <dbl>, S133 <dbl>, S134 <dbl>, S135 <dbl>,
## # S140 <dbl>, S142 <dbl>, …
## asv_prop:
## # A tibble: 2,707 × 163
## taxon_id S101 S103 S104 S106 S11 S111 S113
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 acm 0.0396 0.00190 8.63e-4 0.00224 0.0414 0 0.00941
## 2 bgb 0.00757 0.0295 3.11e-2 0.0237 0.0643 0.0559 0.00866
## 3 bgc 0.0316 0 0 0 0 0 0
## # ℹ 2,704 more rows
## # ℹ 155 more variables: S114 <dbl>, S115 <dbl>, S118 <dbl>,
## # S120 <dbl>, S122 <dbl>, S125 <dbl>, S127 <dbl>, S133 <dbl>,
## # S134 <dbl>, S135 <dbl>, …
## tax_data:
## # A tibble: 1,130 × 4
## taxon_id n_samples mean_prop rel_stand_dev
## <chr> <int> <dbl> <dbl>
## 1 aab 162 1 0.443
## 2 aac 162 0.416 0.652
## 3 aad 162 0.584 0.711
## # ℹ 1,127 more rows
## 0 functions:
set.seed(1000)
obj_subset %>%
filter_taxa(! is_stem) %>%
filter_taxa(n_samples >= 10, supertaxa = TRUE, reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "_sp$"), reassign_obs = FALSE) %>%
filter_taxa(! grepl(x = taxon_names, "incertae_sedis", ignore.case = TRUE), reassign_obs = FALSE) %>%
remove_redundant_names() %>%
heat_tree(node_size = mean_prop,
edge_size = n_samples,
node_color = ifelse(is.na(rel_stand_dev), 0, rel_stand_dev),
#node_color_range = c("#aaaaaa", "#8da0cb", "#66c2a5", "#a6d854", "#fc8d62", "red"),
node_label = taxon_names,
node_size_range = c(0.008, 0.025),
node_label_size_range = c(0.012, 0.018),
edge_label_size_range = c(0.010, 0.013),
node_size_interval = c(0, 1),
edge_size_range = c(0.001, 0.008),
#layout = "da", initial_layout = "re",
node_color_axis_label = "Relative standard deviation",
node_size_axis_label = "Mean proportion of reads",
edge_size_axis_label = "Number of samples",
node_color_digits = 2,
node_size_digits = 2,
edge_color_digits = 2,
edge_size_digits = 2,
#aspect_ratio = 1.618,
output_file = file.path('demulticoder/figures', '/supp_figure_2_whole_community.pdf'))
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.3.2 (2023-10-31)
## os macOS Ventura 13.2.1
## system aarch64, darwin20
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Los_Angeles
## date 2024-11-22
## pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] CRAN (R 4.3.0)
## ade4 1.7-22 2023-02-06 [1] CRAN (R 4.3.0)
## agricolae 1.3-7 2023-10-22 [1] CRAN (R 4.3.1)
## AlgDesign 1.2.1 2022-05-25 [1] CRAN (R 4.3.0)
## ape 5.8 2024-04-11 [1] CRAN (R 4.3.1)
## askpass 1.2.0 2023-09-03 [1] CRAN (R 4.3.0)
## backports 1.5.0 2024-05-23 [1] CRAN (R 4.3.3)
## Biobase 2.62.0 2023-10-26 [1] Bioconductor
## BiocGenerics * 0.48.1 2023-11-02 [1] Bioconductor
## BiocParallel 1.36.0 2023-10-26 [1] Bioconductor
## biomformat 1.30.0 2023-10-26 [1] Bioconductor
## Biostrings * 2.70.3 2024-04-03 [1] bioc_xgit (@c213e35)
## bit 4.5.0 2024-09-20 [1] CRAN (R 4.3.3)
## bit64 4.5.2 2024-09-22 [1] CRAN (R 4.3.3)
## bitops 1.0-8 2024-07-29 [1] CRAN (R 4.3.3)
## broom 1.0.6 2024-05-17 [1] CRAN (R 4.3.3)
## bslib 0.8.0 2024-07-29 [1] CRAN (R 4.3.3)
## cachem 1.1.0 2024-05-16 [1] CRAN (R 4.3.2)
## car 3.1-2 2023-03-30 [1] CRAN (R 4.3.0)
## carData 3.0-5 2022-01-06 [1] CRAN (R 4.3.0)
## cli 3.6.3 2024-06-21 [1] CRAN (R 4.3.2)
## cluster 2.1.6 2023-12-01 [1] CRAN (R 4.3.1)
## codetools 0.2-20 2024-03-31 [1] CRAN (R 4.3.1)
## colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.3.3)
## cowplot 1.1.3 2024-01-22 [1] CRAN (R 4.3.1)
## crayon 1.5.3 2024-06-20 [1] CRAN (R 4.3.3)
## data.table * 1.15.4 2024-03-30 [1] CRAN (R 4.3.1)
## decontam * 1.22.0 2023-10-26 [1] Bioconductor
## DelayedArray 0.28.0 2023-11-06 [1] Bioconductor
## DESeq2 1.42.1 2024-03-09 [1] Bioconductor 3.18 (R 4.3.3)
## digest 0.6.37 2024-08-19 [1] CRAN (R 4.3.3)
## dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
## evaluate 1.0.1 2024-10-10 [1] CRAN (R 4.3.3)
## fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
## farver 2.1.2 2024-05-13 [1] CRAN (R 4.3.3)
## fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.3.2)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
## foreach 1.5.2 2022-02-02 [1] CRAN (R 4.3.0)
## furrr * 0.3.1 2022-08-15 [1] CRAN (R 4.3.0)
## future * 1.34.0 2024-07-29 [1] CRAN (R 4.3.3)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
## GenomeInfoDb * 1.38.8 2024-03-16 [1] Bioconductor 3.18 (R 4.3.3)
## GenomeInfoDbData 1.2.11 2023-11-14 [1] Bioconductor
## GenomicRanges 1.54.1 2023-10-30 [1] Bioconductor
## ggfittext 0.10.2 2024-02-01 [1] CRAN (R 4.3.1)
## ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
## ggpubr 0.6.0 2023-02-10 [1] CRAN (R 4.3.0)
## ggsignif 0.6.4 2022-10-13 [1] CRAN (R 4.3.0)
## globals 0.16.3 2024-03-08 [1] CRAN (R 4.3.1)
## glue 1.8.0 2024-09-30 [1] CRAN (R 4.3.3)
## gridExtra * 2.3 2017-09-09 [1] CRAN (R 4.3.0)
## gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1)
## highr 0.11 2024-05-26 [1] CRAN (R 4.3.3)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
## htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
## igraph 2.0.3 2024-03-13 [1] CRAN (R 4.3.1)
## IRanges * 2.36.0 2023-10-26 [1] Bioconductor
## iterators 1.0.14 2022-02-05 [1] CRAN (R 4.3.0)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.3.0)
## jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
## knitr 1.48 2024-07-07 [1] CRAN (R 4.3.3)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.3.0)
## lattice * 0.22-6 2024-03-20 [1] CRAN (R 4.3.1)
## lazyeval 0.2.2 2019-03-15 [1] CRAN (R 4.3.0)
## lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
## listenv 0.9.1 2024-01-29 [1] CRAN (R 4.3.1)
## locfit 1.5-9.10 2024-06-24 [1] CRAN (R 4.3.3)
## magick * 2.8.4 2024-07-14 [1] CRAN (R 4.3.3)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
## MASS 7.3-60.0.1 2024-01-13 [1] CRAN (R 4.3.1)
## Matrix 1.6-5 2024-01-11 [1] CRAN (R 4.3.1)
## MatrixGenerics 1.14.0 2023-10-26 [1] Bioconductor
## matrixStats 1.3.0 2024-04-11 [1] CRAN (R 4.3.1)
## metacoder * 0.3.7 2024-02-20 [1] CRAN (R 4.3.1)
## mgcv 1.9-1 2023-12-21 [1] CRAN (R 4.3.1)
## multtest 2.58.0 2023-10-26 [1] Bioconductor
## munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
## nlme 3.1-165 2024-06-06 [1] CRAN (R 4.3.3)
## parallelly 1.38.0 2024-07-27 [1] CRAN (R 4.3.3)
## pdftools * 3.4.1 2024-09-20 [1] CRAN (R 4.3.3)
## permute * 0.9-7 2022-01-27 [1] CRAN (R 4.3.0)
## phyloseq * 1.46.0 2024-04-03 [1] bioc_xgit (@7320133)
## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.3.1)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
## qpdf 1.3.4 2024-10-04 [1] CRAN (R 4.3.3)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
## ragg 1.3.2 2024-05-15 [1] CRAN (R 4.3.3)
## Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.3.2)
## RCurl 1.98-1.16 2024-07-11 [1] CRAN (R 4.3.3)
## readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.1)
## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.3.0)
## rhdf5 2.46.1 2023-12-02 [1] Bioconductor 3.18 (R 4.3.2)
## rhdf5filters 1.14.1 2023-11-06 [1] Bioconductor
## Rhdf5lib 1.24.2 2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
## rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.2)
## rmarkdown 2.27 2024-05-17 [1] CRAN (R 4.3.3)
## rstatix 0.7.2 2023-02-01 [1] CRAN (R 4.3.0)
## rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
## S4Arrays 1.2.1 2024-03-05 [1] Bioconductor 3.18 (R 4.3.2)
## S4Vectors * 0.40.2 2023-11-25 [1] Bioconductor 3.18 (R 4.3.2)
## sass 0.4.9 2024-03-15 [1] CRAN (R 4.3.1)
## scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
## SparseArray 1.2.4 2024-02-10 [1] Bioconductor 3.18 (R 4.3.2)
## stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.2)
## stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
## SummarizedExperiment 1.32.0 2023-11-06 [1] Bioconductor
## survival 3.7-0 2024-06-05 [1] CRAN (R 4.3.3)
## svglite 2.1.3 2023-12-08 [1] CRAN (R 4.3.1)
## systemfonts 1.1.0 2024-05-15 [1] CRAN (R 4.3.3)
## textshaping 0.4.0 2024-05-24 [1] CRAN (R 4.3.3)
## tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
## tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
## tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
## utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
## vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
## vegan * 2.6-6.1 2024-05-21 [1] CRAN (R 4.3.3)
## viridis 0.6.5 2024-01-29 [1] CRAN (R 4.3.1)
## viridisLite 0.4.2 2023-05-02 [1] CRAN (R 4.3.0)
## vroom 1.6.5 2023-12-05 [1] CRAN (R 4.3.1)
## withr 3.0.1 2024-07-31 [1] CRAN (R 4.3.2)
## xfun 0.48 2024-10-03 [1] CRAN (R 4.3.3)
## XVector * 0.42.0 2023-10-26 [1] Bioconductor
## yaml 2.3.10 2024-07-26 [1] CRAN (R 4.3.3)
## zlibbioc 1.48.2 2024-03-19 [1] Bioconductor 3.18 (R 4.3.3)
##
## [1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────